Preprocess the test set: strip HTML from question titles and bodies, tokenize and lemmatize the words, build per-block Id dictionaries for row indexing, and store the resulting word counts as sparse matrices.


In [1]:
import pandas as pd
import numpy as np
import string
import time
import csv
import nltk  # needed for nltk.word_tokenize below
from scipy.sparse import lil_matrix
from scipy.io import mmwrite, mmread
from bs4 import BeautifulSoup
from nltk.tag import brill
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from ast import literal_eval
from taggerfunctions import *
import sklearn as sk

In [2]:
# Punctuation that carries no signal for tag prediction. '. ' (dot plus
# space) removes sentence-ending periods while keeping dots inside
# tokens like 'node.js' or '2.7'.
uselesssymbols = ['. ', '\n', "'", '\"', '(', ')', ',', ';', ':', '?', '!', '&', '$']

def tokenizeWords(entry):
    # Strip code blocks, links and images before tokenizing.
    soup = BeautifulSoup(entry, "html.parser")
    for tag in soup.find_all(["pre", "code", "a", "img"]):
        tag.decompose()
    # Drop non-ASCII characters, then replace punctuation with spaces.
    entry = soup.get_text().encode('ascii', 'ignore').decode('ascii')
    for symbol in uselesssymbols:
        entry = entry.replace(symbol, ' ')
    entrytok = nltk.word_tokenize(entry)
    entrytok = [w.lower() for w in entrytok]
    return tag_pos(entrytok)
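
As a quick sanity check, the HTML-stripping step can be exercised on its own with a made-up post body (the sample string below is purely illustrative; tag_pos and the tagger are defined in the next cells):

In [ ]:
# Illustrative only: exercise the tag-stripping step on a made-up post.
sample = '<p>How do I sort a <code>dict</code> by value? See <a href="http://example.com">the docs</a>.</p>'
soup = BeautifulSoup(sample, "html.parser")
for tag in soup.find_all(["pre", "code", "a", "img"]):
    tag.decompose()
print(soup.get_text())  # -> 'How do I sort a  by value? See .'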

In [3]:
def tag_pos(entrytok):
    # Keep only content-bearing parts of speech: drop verbs, conjunctions,
    # numbers, adverbs, determiners, prepositions, pronouns, etc.
    entryselect = []
    entrytoktag = braubt_tagger.tag(entrytok)
    for tok, tag in entrytoktag:
        if tag not in ('VBP', 'CC', 'CD', 'RB', 'TO', 'VB', 'DT', 'IN', 'PRP', 'VBZ', 'WDT', '-NONE-'):
            try:
                tok_lemmatized = lemmatizer.lemmatize(tok, get_wordnet_pos(tag))
            except Exception:
                # Tags without a WordNet equivalent fall back to the
                # default (noun) lemmatization.
                tok_lemmatized = lemmatizer.lemmatize(tok)
            entryselect.append(tok_lemmatized)
    return entryselect
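
get_wordnet_pos is imported from taggerfunctions, which is not shown here. A typical implementation maps Penn Treebank tags onto WordNet POS constants and raises for tags with no WordNet equivalent, which is what the try/except above falls back from; a sketch:

In [ ]:
# Sketch only -- the real get_wordnet_pos lives in taggerfunctions.
def get_wordnet_pos_sketch(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    raise KeyError(treebank_tag)  # no WordNet equivalent for this tag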

In [4]:
from taggerfunctions import *

# Train the tagger once; tag_pos reuses this global instance for every entry.
braubt_tagger = braubt_Tagger()
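
braubt_Tagger is also defined in taggerfunctions. Going by the conventional naming, it presumably trains a sequential backoff chain (unigram/bigram/trigram over a default tagger) and then refines it with Brill transformation rules; a rough sketch of that recipe:

In [ ]:
# Sketch of the usual "braubt" recipe; the actual implementation is in
# taggerfunctions and may differ.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

train_sents = treebank.tagged_sents()
tagger = DefaultTagger('NN')
for cls in (UnigramTagger, BigramTagger, TrigramTagger):
    tagger = cls(train_sents, backoff=tagger)
# A Brill trainer (nltk.tag.brill) would then learn transformation
# rules on top of this backoff chain.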

In [5]:
def getDict(fname):
    # Read a word -> column-index mapping written by pd.Series.to_csv;
    # literal_eval turns the stored value string back into an int.
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        dictWords = {rows[0]: literal_eval(rows[1]) for rows in reader}
    return pd.Series(dictWords)
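
getDict assumes the CSV was written by pd.Series.to_csv as plain key,value rows (the pandas version used here wrote no header; newer versions need header=False). A minimal round trip:

In [ ]:
# Minimal round trip for the format getDict expects. On newer pandas,
# pass header=False so no header row pollutes the mapping.
demo = pd.Series({'python': 0, 'arrays': 1})
demo.to_csv('demo_dict.csv')
print(getDict('demo_dict.csv'))  # values come back as ints via literal_eval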

In [6]:
# Load the previously built mappings from tag keys, body words and
# title words to matrix column indices.
fname = 'dictKeys.csv'
dictKeys = getDict(fname)
fname = 'dictWordsBodyFull.csv'
dictWordsBody = getDict(fname)
fname = 'dictWordsTitleNew.csv'
dictWordsTitle = getDict(fname)
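
Each of these Series maps a word (or key) to a matrix column index, so the values should form exactly the range 0..len-1; a cheap check of that assumption:

In [ ]:
# Consistency check (assumption about how the dictionaries were built):
# values must cover 0..len-1 to be usable directly as column indices.
assert sorted(dictWordsTitle.values) == list(range(len(dictWordsTitle)))
assert sorted(dictWordsBody.values) == list(range(len(dictWordsBody)))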

In [42]:
# Collect all test-question Ids, reading the file 100,000 rows at a time.
reader = pd.read_csv("Test.csv", chunksize=100000)
IdTest = []
for chunk in reader:
    IdTest.extend(chunk['Id'].tolist())

In [45]:
# Split the test Ids into ten blocks of 200,000 and save, for each block,
# the Id -> row-index mapping and its inverse.
for idx in range(10):
    block = IdTest[idx*200000:(idx+1)*200000]
    dictId = pd.Series({qid: qidnum for qidnum, qid in enumerate(block)})
    fname = "dictIdTest_" + str(idx*200000) + "-" + str((idx+1)*200000) + ".csv"
    dictId.to_csv(fname)
    invdictId = pd.Series({qidnum: qid for qidnum, qid in enumerate(block)})
    fname = "invdictIdTest_" + str(idx*200000) + "-" + str((idx+1)*200000) + ".csv"
    invdictId.to_csv(fname)

In [46]:
# Final partial block: Ids for rows 2,000,000 through 2,013,336.
# (The original referenced an undefined `Id`; it should be IdTest.)
block = IdTest[2000000:]
dictId = pd.Series({qid: qidnum for qidnum, qid in enumerate(block)})
fname = "dictIdTest_2000000-2013337.csv"
dictId.to_csv(fname)
invdictId = pd.Series({qidnum: qid for qidnum, qid in enumerate(block)})
fname = "invdictIdTest_2000000-2013337.csv"
invdictId.to_csv(fname)
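
dictId and invdictId must invert each other for every block, since dictId assigns the matrix row below and invdictId recovers the question Id afterwards; a spot check (not part of the original run):

In [ ]:
# Spot check: the two mappings for the last block are mutual inverses.
qid = IdTest[2000000]
assert invdictId[dictId[qid]] == qid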

In [ ]:
# Build sparse word-count matrices for the test questions, one block of
# 200,000 rows at a time. The final partial block (rows beyond
# 2,000,000) is handled in the next cell.
lemmatizer = WordNetLemmatizer()
reader = pd.read_csv("Test.csv", chunksize=100000)
timeStart = time.time()
count = 1
dictId = getDict("dictIdTest_0-200000.csv")
testQWordsBody_lil = lil_matrix((len(dictId), len(dictWordsBody)))
testQWordsTitle_lil = lil_matrix((len(dictId), len(dictWordsTitle)))
for chunk in reader:
    for Id, title, body in zip(chunk['Id'], chunk['Title'], chunk['Body']):
        if count > 2000000:
            break  # the final partial block is processed in the next cell
        # Count occurrences of each in-vocabulary title word.
        titlewords = tokenizeWords(title)
        settitle = set(titlewords)
        iterWords = [w for w in settitle if w in dictWordsTitle]
        for word in iterWords:
            testQWordsTitle_lil[dictId[str(Id)], dictWordsTitle[word]] += titlewords.count(word)
        # Same for the body words.
        bodywords = tokenizeWords(body)
        setbody = set(bodywords)
        iterWords = [w for w in setbody if w in dictWordsBody]
        for word in iterWords:
            testQWordsBody_lil[dictId[str(Id)], dictWordsBody[word]] += bodywords.count(word)
        if count % 100000 == 0:
            print("entry {0:d} finished".format(count))
            print("time for 100000 loops: {0:.0f}s".format(time.time() - timeStart))
            timeStart = time.time()
        if count % 200000 == 0:
            # Save the finished block and start a fresh pair of matrices.
            fname = "testWordsQTitle_" + str(count-200000) + "-" + str(count) + ".mtx"
            mmwrite(fname, testQWordsTitle_lil)
            fname = "testWordsQBody_" + str(count-200000) + "-" + str(count) + ".mtx"
            mmwrite(fname, testQWordsBody_lil)
            testQWordsBody_lil = lil_matrix((200000, len(dictWordsBody)))
            testQWordsTitle_lil = lil_matrix((200000, len(dictWordsTitle)))
            print("files saved")
            if count < 2000000:
                # The dictionary for the tail block has a different name,
                # so only reload while full 200,000-row blocks remain.
                fname = "dictIdTest_" + str(count) + "-" + str(count+200000) + ".csv"
                dictId = getDict(fname)
        count += 1
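
mmwrite stores each block in MatrixMarket format; mmread loads it back as a COO matrix, so downstream code will typically convert to CSR before row slicing. For example, to inspect the first saved block:

In [ ]:
# Read one saved block back; mmread returns coo_matrix, so convert to
# CSR for efficient row access.
block = mmread("testWordsQTitle_0-200000.mtx").tocsr()
print(block.shape)  # expected: (200000, len(dictWordsTitle))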

In [26]:
# Same pass for the final partial block (rows 2,000,000-2,013,336): the
# file is re-read from the start, but everything before row 2,000,000
# is skipped.
lemmatizer = WordNetLemmatizer()
reader = pd.read_csv("Test.csv", chunksize=100000)
timeStart = time.time()
count = 1
dictId = getDict("dictIdTest_2000000-2013337.csv")
testQWordsBody_lil = lil_matrix((len(dictId), len(dictWordsBody)))
testQWordsTitle_lil = lil_matrix((len(dictId), len(dictWordsTitle)))
for chunk in reader:
    for Id, title, body in zip(chunk['Id'], chunk['Title'], chunk['Body']):
        if count > 2000000:
            if count == 2000001:
                print("start of evaluation")
            titlewords = tokenizeWords(title)
            settitle = set(titlewords)
            iterWords = [w for w in settitle if w in dictWordsTitle]
            for word in iterWords:
                testQWordsTitle_lil[dictId[str(Id)], dictWordsTitle[word]] += titlewords.count(word)
            bodywords = tokenizeWords(body)
            setbody = set(bodywords)
            iterWords = [w for w in setbody if w in dictWordsBody]
            for word in iterWords:
                testQWordsBody_lil[dictId[str(Id)], dictWordsBody[word]] += bodywords.count(word)
        count += 1
fname = "testWordsQTitle_2000000-2013337.mtx"
mmwrite(fname, testQWordsTitle_lil)
fname = "testWordsQBody_2000000-2013337.mtx"
mmwrite(fname, testQWordsBody_lil)


start of evaluation
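
If a later stage needs the whole test set in one matrix, the eleven saved blocks can be stacked back together in order; a sketch assuming all files above were written successfully:

In [ ]:
# Sketch: reassemble the full title matrix from the saved blocks.
from scipy.sparse import vstack
bounds = [(i * 200000, (i + 1) * 200000) for i in range(10)] + [(2000000, 2013337)]
blocks = [mmread("testWordsQTitle_{0}-{1}.mtx".format(lo, hi)).tocsr()
          for lo, hi in bounds]
testQWordsTitle = vstack(blocks).tocsr()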